# news_scraper/settings.py

# Basic settings: project name and the package where spiders live.
BOT_NAME = "news_scraper"
NEWSPIDER_MODULE = "news_scraper.spiders"
# Spiders are discovered in the same package that new spiders are created in.
SPIDER_MODULES = [NEWSPIDER_MODULE]

# User-Agent setting: a desktop Chrome 91 on Windows 10 identification string.
# Split over several literals only for line length; the value is one string.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)

# robots.txt setting: robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False

# Concurrency and delay settings: a single request at a time, with a
# randomized wait based on a 3-second download delay.
DOWNLOAD_DELAY = 3
RANDOMIZE_DOWNLOAD_DELAY = True
CONCURRENT_REQUESTS = 1

# Spider middlewares: project-specific middlewares keyed by import path,
# with the integer giving each one's position in the middleware chain.
SPIDER_MIDDLEWARES = {
    "news_scraper.middlewares.NewsScraperSpiderMiddleware": 543,
    "news_scraper.middlewares.ImageProcessingMiddleware": 544,
}

# Downloader middlewares, listed in ascending priority order.
# The four scrapy.* entries re-prioritize built-in middlewares.
# NOTE(review): Scrapy's documented defaults are Retry=550, HttpProxy=750,
# Cookies=700 and HttpCompression=590 -- confirm this reordering is intended.
# NOTE(review): COOKIES_ENABLED is False in this file, so the CookiesMiddleware
# entry presumably has no effect -- confirm.
DOWNLOADER_MIDDLEWARES = {
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": 90,
    "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 110,
    "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 130,
    "news_scraper.middlewares.NewsScraperDownloaderMiddleware": 543,
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
}

# Item pipelines: the general news pipeline (300) is ordered ahead of the
# image processing pipeline (301).
ITEM_PIPELINES = {
    "news_scraper.pipelines.NewsScraperPipeline": 300,
    "news_scraper.pipelines.ImageProcessingPipeline": 301,
}

# Retry settings: failed requests are retried up to RETRY_TIMES times when
# the response status is one of RETRY_HTTP_CODES.
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]

# HTTP cache settings: responses are stored on disk under HTTPCACHE_DIR.
HTTPCACHE_ENABLED = True
# 0 means cached responses never expire.
# NOTE(review): for a news site this means a page is never re-fetched once
# cached -- confirm this is intended outside of development.
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
# Never cache the transient error statuses that trigger a retry. With an
# empty ignore list the error response itself is cached, so every retry (and
# every later run, since entries never expire) is answered from the cache
# with the same error instead of contacting the server again.
HTTPCACHE_IGNORE_HTTP_CODES = list(RETRY_HTTP_CODES)
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Logging settings: INFO and above is written to spider.log, each record
# formatted as "<time> [<logger name>] <level>: <message>".
LOG_FILE = "spider.log"
LOG_LEVEL = "INFO"
LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"

# Request and response settings: redirects are followed, cookies are
# switched off, and the download timeout is set to 30.
REDIRECT_ENABLED = True
COOKIES_ENABLED = False
DOWNLOAD_TIMEOUT = 30

# Image-related settings.
IMAGES_STORE = "images"
MEDIA_ALLOW_REDIRECTS = True
# Minimum accepted image dimensions.
IMAGES_MIN_WIDTH = 100
IMAGES_MIN_HEIGHT = 100
# Size cap of 5 MB expressed in bytes.
# NOTE(review): presumably consumed by the project's own image code -- confirm.
MAX_IMAGE_SIZE = 5 * 1024 ** 2

# Custom settings.
# NOTE(review): IMAGES_BASE64_ENABLED looks like a project-specific flag read
# by the image processing components -- confirm where it is consumed.
IMAGES_BASE64_ENABLED = True
# Crawl no deeper than 3 links from the start pages.
DEPTH_LIMIT = 3
CONCURRENT_ITEMS = 100

# Request header settings: default headers for outgoing requests.
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en",
}

# Auto-throttle settings: throttling is on, starting from a delay of 5 and
# capped at 60, aiming for one concurrent request; throttle debug output is
# off.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_DEBUG = False
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60

# Memory usage settings: monitoring is on, with a warning threshold of
# 384 MB below the 512 MB limit.
MEMUSAGE_ENABLED = True
MEMUSAGE_WARNING_MB = 384
MEMUSAGE_LIMIT_MB = 512

# DNS settings.
DNS_TIMEOUT = 60

